In [1]:
import pandas as pd
import numpy as np
store = pd.read_csv('store.csv')
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
In [2]:
print "Lets look at training set-"
print train.head(6)
print "Lets look at stores data-"
print train.head(6)
print "Given number of rows for training =", train.shape[0]
print "Given number of rows for testing =", test.shape[0]
In [3]:
sf = pd.merge(train, store, on='Store') #merge data for analysis
print "List of columns -", sf.columns
sf['Open'] = sf['Open'].apply(lambda x: 0 if np.isnan(x) else x) #get rid of NaN values if store is 'closed'
In [4]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
In [5]:
#a better way to get more details, timeline view of medians of each day of the week-
day = sf[(sf['Open']!=0)]
sales_day = day.groupby('DayOfWeek')['Sales'].median()
cust_day = day.groupby('DayOfWeek')['Customers'].median()
#
fig, (axis1) = plt.subplots(1,1, sharex=True, figsize=(10,5))
# plot median sales
ax1 = sales_day.plot(legend=True, ax=axis1, marker='o',title="Fig. 1, Median sales of each day of the week")
ax1.set_xticks(sales_day.index)
tmp = ax1.set_xticklabels(sales_day.index.tolist(), rotation=90)
# overlay customer data
cust_day.plot(legend=True, ax=axis1, marker='x', secondary_y=True)
Out[5]:
In [6]:
#for more insights lets split Year-Month-Date to three different columns
def date_change(data):
data['Month'] = data['Date'].apply(lambda x : int(str(x)[5:7]))
data['Year'] = data['Date'].apply(lambda x : int(str(x)[:4]))
data['MonthYear'] = data['Date'].apply(lambda x : (str(x)[:7]))
data['date_int'] = data['Date'].apply(lambda x : (str(x)[8:10]))
data = data.drop('Date', axis=1)
return data
sf = date_change(sf)
test = date_change(test)
In [7]:
day = sf[(sf['Open']!=0)]
sales_day = day.groupby('date_int')['Sales'].median()
cust_day = day.groupby('date_int')['Customers'].median()
#
fig, (axis1) = plt.subplots(1,1, sharex=True, figsize=(10,5))
# plot median sales
ax1 = sales_day.plot(legend=True, ax=axis1, marker='o',title="Fig. 2: Sales vs Dates")
ax1.set_xticks(sales_day.index)
#ax1.set_yticks(date_int)
tmp = ax1.set_xticklabels(sales_day.index.tolist(), rotation=90)
# overlay customer data
cust_day.plot(legend=True, ax=axis1, marker='x', secondary_y=True)
Out[7]:
In [8]:
import calendar
# select all stores that were open
subs = sf[sf['Open']!=0]
# groupby Year and Month
selected_sales = subs.groupby(['Year', 'Month'])['Sales'].mean()
selected_cust = subs.groupby(['Year', 'Month'])['Customers'].mean()
# plot
fig, (axis1) = plt.subplots(1,1, figsize=(10,7))
selected_sales.unstack().T.plot(ax=axis1)
tmp = axis1.set_title("Fig. 3: Yearly Sales")
tmp = axis1.set_ylabel("Sales")
tmp = axis1.set_xticks(range(0,13))
tmp = axis1.set_xticklabels(calendar.month_abbr)
In [10]:
# median sales
median_sales = sf.groupby('MonthYear')['Sales'].median()
pct_median_change = sf.groupby('MonthYear')['Sales'].median().pct_change()
# median customers
median_cust = sf.groupby('MonthYear')['Customers'].median()
pct_median_custchange = sf.groupby('MonthYear')['Customers'].median().pct_change()
fig, (axis1, axis2) = plt.subplots(2, 1, sharex=True, figsize=(15,15))
# plot median sales
ax1 = median_sales.plot(legend=True, ax=axis1, marker='o',title="Fig. 4a: Monthly changes in Sales/Customers")
ax1.set_xticks(range(len(median_sales)))
ax1.set_xticklabels(median_sales.index.tolist(), rotation=90)
# plot pct change
ax2 = pct_median_change.plot(legend=True, ax=axis2, marker='o',rot=90, title="Fig. 4b: Percent Change")
# overlay customer data
median_cust.plot(legend=True, ax=axis1, marker='x', secondary_y=True)
pct_median_custchange.plot(legend=True, ax=axis2, marker='x', rot=90, secondary_y=True)
Out[10]:
In [11]:
fig, (axis3) = plt.subplots(1,1, sharex=True, figsize=(10,5))
# stateholiday overlay
# StateHoliday has a mixture of "O", 0 and "0", replace these with "O"
sf.StateHoliday.replace(["O",0,"0"],['O','O','O'], inplace=True)
ax3 = sns.countplot(x='MonthYear', hue='StateHoliday', data=sf[sf['StateHoliday']!='O'], palette='husl', ax=axis3)
tmp = ax3.set_xticklabels(ax3.get_xticklabels(), rotation=90)
tmp = ax3.set_title('Fig. 5: State Holidays')
In [12]:
fig, (axis4) = plt.subplots(1,1, sharex=True, figsize=(10,5))
ax4 = sns.countplot(x='MonthYear', hue='SchoolHoliday', data=sf[sf['SchoolHoliday']!=0], palette='husl', ax=axis4)
subs = sf[sf['SchoolHoliday']!=0]
#cntchange = subs.groupby('MonthYear')['SchoolHoliday'].count().pct_change()
#cntchange.plot(ax=axis4, marker='x', secondary_y=True)
tmp = ax4.set_xticklabels(ax4.get_xticklabels(), rotation=90)
tmp = ax4.set_title('Fig.6: School Holidays and Percentage Change in School Holidays')
In [13]:
fig, (axis5) = plt.subplots(1,1, sharex=True, figsize=(10,5))
#promotions overlay
ax5 = sns.countplot(x='MonthYear', hue='Promo', data=sf[sf["Promo"]!=0], palette='husl', ax=axis5)
subs = sf[sf['Promo']!=0]
cntchange = subs.groupby('MonthYear')['Promo'].count().pct_change()
cntchange.plot(ax=axis5, marker='x', secondary_y=True)
tmp = ax5.set_xticklabels(ax5.get_xticklabels(), rotation=90)
tmp = ax5.set_title('Fig. 7: Promotions and Percentage Change in Promotions')
In [ ]: